Analyze selection data using soluble Ephrin-B2 or -B3¶

In [1]:
# this cell is tagged as parameters for `papermill` parameterization
#input configs
altair_config = None
nipah_config = None

#input files
func_scores_E2_file = None
binding_E2_file = None
func_scores_E3_file = None
binding_E3_file = None

#output files
filtered_E2_binding_data = None
filtered_E3_binding_data = None

#output images
E2_binding_entry = None
E3_binding_entry = None
E2_binding_heatmap = None
E3_binding_heatmap = None
E2_E3_correlation = None
E2_E3_correlation_site = None
binding_by_site_plot = None

entropy_file = None
In [2]:
# Parameters
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
E2_binding_entry = "results/images/E2_binding_entry.html"
E3_binding_entry = "results/images/E3_binding_entry.html"
E2_binding_heatmap = "results/images/E2_binding_heatmap.html"
E3_binding_heatmap = "results/images/E3_binding_heatmap.html"
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
entropy_file = "results/entropy/entropy.csv"
binding_by_site_plot = "results/images/binding_by_site_plot.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")
Setup in correct directory
In [5]:
#hard paths in case don't want to run with snakemake
#
#altair_config = "data/custom_analyses_data/theme.py"
#nipah_config = "nipah_config.yaml"
#
##input files
#func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
#binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
#func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
#binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
#
##output files
#filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
#filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
#
###output images
#E2_binding_entry = "results/images/E2_binding_entry.html"
#E3_binding_entry = "results/images/E3_binding_entry.html"
#E2_binding_heatmap = None
#E3_binding_heatmap = None
#E2_E3_correlation = None
#E2_E3_correlation_site = None
#
#entropy_file = "results/entropy/entropy.csv"

Run config files to setup altair theme and config variables¶

In [6]:
if altair_config:
    with open(altair_config, 'r') as file:
        exec(file.read())

with open(nipah_config) as f:
    config = yaml.safe_load(f)

Make the E2/E3 dataframes, filter separately, then merge¶

In [7]:
e2 = pd.read_csv(binding_E2_file)
e2_func = pd.read_csv(func_scores_E2_file)
e3 = pd.read_csv(binding_E3_file)
e3_func = pd.read_csv(func_scores_E3_file)

def merge_func_binding_dfs(func,binding,name):
    df_int = pd.merge(
        binding,
        func,
        on=['site','mutant','wildtype'],
        suffixes=['_affinity','_cell_entry'],
        validate='one_to_one',
        how='outer'
    )
    df = df_int.rename(columns={'Ephrin binding_mean':'binding_mean','Ephrin binding_std':'binding_std','Ephrin binding_median':'binding_median'})
    
    #only filter func effects
    df_pre_filter = df[
        (df['mutant'] != '*') &
        (df['mutant'] != '-') &
        (df['site'] != 603) &
        (df['effect'] >= config['min_func_effect_for_binding']) &
        (df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
        (df['effect_std'] <= config['func_std_cutoff']) 
    ]
    #Now filter binding 
    df_post_filter = df_pre_filter[
        (df_pre_filter['times_seen_affinity'] >= config['min_times_seen_binding']) &
        (df_pre_filter['binding_std'] <= config['max_binding_std']) &
        (df_pre_filter['frac_models'] >= config['frac_models'])
    ]
    
    def plot_affinity_corr(df):
        if name == 'E2':
            color = '#1f4e79'
        else:
            color = '#ff7f0e'
        chart = alt.Chart(df).mark_point(size=30, color='black', opacity=0.2, filled=True).encode(
            x=alt.X('effect', title=f'{name} Cell Entry', axis=alt.Axis(grid=True)),
            y=alt.Y('binding_median', title=f'{name} Binding', axis=alt.Axis(grid=True)),
            tooltip=['site', 'wildtype', 'mutant'],
        ).properties(
            width=alt.Step(10),
            height=alt.Step(10),
        )
        return chart.display()
            
    def plot_times_seen_std(df):
        if name == 'E2':
            color = '#1f4e79'
        else:
            color = '#ff7f0e'
        chart = alt.Chart(df).mark_circle(size=20,color='black',opacity=0.2).encode(
            x=alt.X('times_seen_affinity',axis=alt.Axis(grid=True),title=f'Times Seen for {name}',scale=alt.Scale(type='log')),
            y=alt.Y('binding_std',title='Binding Std',axis=alt.Axis(grid=True)),
            tooltip=['site','times_seen_affinity','effect_std','effect']
        ).properties(
            height=alt.Step(10),
            width=alt.Step(10)
        )
        return chart.display()
    
    plot_affinity_corr(df)  
    entry_vs_binding = plot_affinity_corr(df_post_filter)
    #entry_vs_binding.display()
    #entry_vs_binding.save(entry_vs_binding)
    
    plot_times_seen_std(df)
    plot_times_seen_std(df_post_filter)
    
    #For pulling out low effect mutants for heatmaps later
    def store_filtered_info(df):
        df_filter = df[
            (df['mutant'] != '*') &
            (df['mutant'] != '-') &
            (df['site'] != 603) &
            (df['effect'] < config['min_func_effect_for_binding']) &
            (df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
            (df['effect_std'] <= config['func_std_cutoff']) 
        ]
        return df_filter
    df_low_effect_filter = store_filtered_info(df)


    
    return df_post_filter,df_low_effect_filter

df_E2_filter,df_E2_filter_missing = merge_func_binding_dfs(e2_func,e2,'EFNB2')
df_E3_filter,df_E3_filter_missing = merge_func_binding_dfs(e3_func,e3,'EFNB3')

def plot_corr_binding_entry(df,name):
        chart = alt.Chart(df).mark_point(size=30, color='black', opacity=0.2, filled=True).encode(
            x=alt.X('effect', title=f'{name} Cell Entry', axis=alt.Axis(grid=True)),
            y=alt.Y('binding_median', title=f'{name} Binding', axis=alt.Axis(grid=True)),
            tooltip=['site', 'wildtype', 'mutant'],
        ).properties(
            width=alt.Step(10),
            height=alt.Step(10),
        )
        return chart
E2_binding_entry_tmp = plot_corr_binding_entry(df_E2_filter,'EFNB2')
E2_binding_entry_tmp.save(E2_binding_entry)
E3_binding_entry_tmp = plot_corr_binding_entry(df_E3_filter,'EFNB3')
E3_binding_entry_tmp.save(E3_binding_entry)

#Save filtered dataframes for crystal structure mapping
df_E2_filter.to_csv(filtered_E2_binding_data,index=False)
df_E3_filter.to_csv(filtered_E3_binding_data,index=False)

#Now that they are filtered, merge EFNB2 and EFNB3
df_affinity_filter_merge = pd.merge(
    df_E2_filter,
    df_E3_filter,
    on=['site','wildtype','mutant'],
    suffixes=['_E2','_E3'],
    how='outer'
)

#Add columns that calculate difference between EFNB2 and EFNB3 cell entry and EFNB2 and EFNB3 binding.
df_affinity_filter_merge['func_effect_diff'] = (df_affinity_filter_merge['effect_E2'] - df_affinity_filter_merge['effect_E3']).abs()
df_affinity_filter_merge['binding_effect_diff'] = (df_affinity_filter_merge['binding_mean_E2'] - df_affinity_filter_merge['binding_mean_E3']).abs()

#display stats
display(df_affinity_filter_merge[['binding_std_E2','binding_std_E3','binding_median_E2','binding_median_E3']].describe())
binding_std_E2 binding_std_E3 binding_median_E2 binding_median_E3
count 6657.000000 6563.000000 6657.000000 6563.000000
mean 0.497838 0.179371 -0.325661 -0.017590
std 0.317808 0.172135 1.079307 0.270343
min 0.004012 0.000018 -5.284000 -1.960000
25% 0.271600 0.060175 -0.406000 -0.150050
50% 0.424900 0.134200 -0.025360 -0.014280
75% 0.644100 0.244750 0.189900 0.116300
max 1.989000 1.795000 2.335000 2.008000
In [8]:
def overall_stats(df,effect,name):
    #Find quantiles
    quantile_2 = df['binding_median'].quantile(.02)
    quantile_98 = df['binding_median'].quantile(.98)
    print(f'The 2% quantile for {name} is: {quantile_2}')
    print(f'The 98% quantile for {name} is: {quantile_98}')

    #Now group sites and find intolerant sites 
    filtered_df = df.groupby('site').filter(lambda group: (group[effect] <-0.25).all())
    unique = filtered_df['site'].unique()
    # Convert unique to a Pandas Series
    unique_series = pd.Series(unique)
    #print(unique_series)
    # Find the common elements
    unique_contact_bool = unique_series.isin(config['contact_sites'])
    #print(unique_contact_bool)
    # Filter and get the common elements
    common_elements = unique_series[unique_contact_bool]

    # Print the common elements
    print(f'Here are the contact sites that are conserved: {common_elements}')
    
    print(f'There are {len(unique)} sites with all negative binding score mutants for {name}')
    print(f'These are the sites for {name} with all negative binding score mutants: {list(unique)}')

    #Now find sites with low and high binding (median)
    median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median')
    print(f'For {name}, these are the sites with lowest median binding scores: {median_df.head(5)}')
    median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median',ascending=False)
    print(f'For {name}, these are the sites with highest median binding scores: {median_df.head(5)}')
    
    #Now calculate mutant number
    total_mutants = df.shape[0]
    upper_cutoff = df[df[effect] > 1].sort_values(by='binding_median',ascending=False)
    median_upper = upper_cutoff['effect'].median()
    print(f'The median entry score for top binders was: {median_upper}')
    
    mutants_above_cutoff_tolerated = upper_cutoff[upper_cutoff['effect'] > 0]
    mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[['site','effect','binding_median','wildtype','mutant']]
    print(f'The mutants with positive entry scores and good binding are: {mutants_above_cutoff_tolerated.head(5)}')
    
    lower_cutoff = df[df[effect] < -1]
    
    print(f'For {name}, there are a total of : {total_mutants} binding mutants')
    print(f'For {name}, there are {upper_cutoff.shape[0]} mutants above cutoff, and {mutants_above_cutoff_tolerated.shape[0]} that have good entry scores')
    print(f'For {name}, there are {lower_cutoff.shape[0]} mutants below cutoff')
 
    total_sites = df['site'].unique().shape[0]
    
    print(f'The total number of sites are: {total_sites}')


overall_stats(df_E2_filter,'binding_median','E2')
overall_stats(df_E3_filter,'binding_median','E3')
The 2% quantile for E2 is: -3.8724000000000003
The 98% quantile for E2 is: 1.112
Here are the contact sites that are conserved: 5     242
11    389
23    488
24    490
25    491
29    501
30    504
31    505
33    531
34    532
35    533
36    557
37    579
38    581
41    588
dtype: int64
There are 44 sites with all negative binding score mutants for E2
These are the sites for E2 with all negative binding score mutants: [116, 206, 220, 236, 238, 242, 243, 248, 346, 351, 352, 389, 390, 398, 399, 400, 438, 439, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590, 594]
For E2, these are the sites with lowest median binding scores:      site  binding_median
443   533          -4.045
407   494          -4.025
404   490          -3.941
402   487          -3.868
414   504          -3.820
For E2, these are the sites with highest median binding scores:      site  binding_median
133   208         1.38740
48    120         1.36300
59    132         1.24100
132   207         1.20775
250   331         1.12100
The median entry score for top binders was: -0.7596
The mutants with positive entry scores and good binding are:       site   effect  binding_median wildtype mutant
1324   139  0.00505           1.989        N      Y
5448   354  0.22810           1.498        S      T
9533   566  0.08127           1.415        F      H
7170   444  0.16280           1.256        I      F
1217   134  0.09948           1.249        S      I
For E2, there are a total of : 6657 binding mutants
For E2, there are 178 mutants above cutoff, and 22 that have good entry scores
For E2, there are 970 mutants below cutoff
The total number of sites are: 511
The 2% quantile for E3 is: -0.634456
The 98% quantile for E3 is: 0.601376
Here are the contact sites that are conserved: 2     389
4     488
8     501
9     531
10    532
dtype: int64
There are 11 sites with all negative binding score mutants for E3
These are the sites for E3 with all negative binding score mutants: [108, 352, 389, 486, 488, 494, 495, 497, 501, 531, 532]
For E3, these are the sites with lowest median binding scores:      site  binding_median
414   501        -0.91625
440   531        -0.72305
308   389        -0.68695
463   555        -0.65450
405   488        -0.64390
For E3, these are the sites with highest median binding scores:      site  binding_median
493   589         0.67100
85    159         0.56090
56    129         0.47550
59    132         0.46735
87    161         0.44775
The median entry score for top binders was: -0.7568
The mutants with positive entry scores and good binding are:        site   effect  binding_median wildtype mutant
7995    492  0.49590           1.200        Q      L
2632    211  0.03675           1.149        G      F
10015   598  0.34370           1.141        P      G
1655    161  0.23430           1.011        S      E
For E3, there are a total of : 6563 binding mutants
For E3, there are 25 mutants above cutoff, and 4 that have good entry scores
For E3, there are 11 mutants below cutoff
The total number of sites are: 507

Find sites with opposite effects on binding¶

In [9]:
#find sites that are different
def find_biggest_differences(df):
    df = df[['site','wildtype','mutant','binding_median_E2',
                 'LibA-231112-EFNB2-monomeric','LibA-231222-BA6-CHO-EFNB2-monomeric','LibB-231108-EFNB2-monomeric', 'LibB-231222-BA6-CHO-EFNB2-monomeric',
                 'effect_E2',
                 'binding_median_E3','LibA-230825-EFNB3-dimeric','LibB-230907-EFNB3-dimeric','effect_E3','effect_std_E3','times_seen_cell_entry_E3','binding_effect_diff']]
                 #,'binding_effect_diff']]
    
    df = df[df['site'].isin(config['contact_sites'])]
    efnb2_good_efnb3_bad = df[
        (df['binding_median_E2'] > 0.1) &
        (df['binding_median_E3'] < -0.1)
    ].sort_values(by='binding_median_E2',ascending=False)
    display(efnb2_good_efnb3_bad)
    efnb2_bad_efnb3_good = df[
        (df['binding_median_E2'] < -0.1) &
        (df['binding_median_E3'] > 0.1)
    ].sort_values(by='binding_median_E3',ascending=False)
    display(efnb2_bad_efnb3_good)


find_biggest_differences(df_affinity_filter_merge)
site wildtype mutant binding_median_E2 LibA-231112-EFNB2-monomeric LibA-231222-BA6-CHO-EFNB2-monomeric LibB-231108-EFNB2-monomeric LibB-231222-BA6-CHO-EFNB2-monomeric effect_E2 binding_median_E3 LibA-230825-EFNB3-dimeric LibB-230907-EFNB3-dimeric effect_E3 effect_std_E3 times_seen_cell_entry_E3 binding_effect_diff
1837 241 S L 0.8667 1.0410 0.61050 0.6921 1.2300 -0.4249 -0.1359 -0.1972 -0.07473 0.15840 0.2774 6.571 1.02940
1832 241 S F 0.6864 0.7802 0.58160 0.5925 1.7230 -0.3853 -0.1101 -0.4214 0.20110 -0.22970 0.2284 6.857 1.02940
1833 241 S G 0.1692 0.2446 0.09392 0.3091 -0.5815 -0.3386 -0.2740 -0.1346 -0.41340 -0.09642 0.2263 5.857 0.29052
site wildtype mutant binding_median_E2 LibA-231112-EFNB2-monomeric LibA-231222-BA6-CHO-EFNB2-monomeric LibB-231108-EFNB2-monomeric LibB-231222-BA6-CHO-EFNB2-monomeric effect_E2 binding_median_E3 LibA-230825-EFNB3-dimeric LibB-230907-EFNB3-dimeric effect_E3 effect_std_E3 times_seen_cell_entry_E3 binding_effect_diff
5266 492 Q L -0.1671 -0.06133 -0.27290 -0.73670 0.52310 0.51440 1.2000 1.22200 1.17800 0.4959 0.2963 4.857 1.3370
6489 588 I P -2.2390 -3.55400 -1.65200 -2.26100 -2.21700 -0.18690 1.0520 1.01300 1.09000 -0.6374 0.3687 5.000 3.4730
5269 492 Q R -2.9170 -3.51500 -2.60900 -2.15100 -3.22500 0.01257 0.6505 0.70400 0.59710 0.4092 0.2253 19.290 3.5255
5243 491 S A -0.9052 -1.08200 -0.72840 -0.40320 -1.52200 -0.59270 0.6041 0.57220 0.63610 0.2335 0.5233 6.143 1.5381
4014 402 R M -0.4290 -0.37740 -0.15310 -0.48070 -0.82860 -0.23740 0.3071 0.27330 0.34090 -0.1438 0.2759 5.857 0.7670
6350 580 I E -0.4390 0.88280 -0.45010 -0.42780 -0.89910 -0.57650 0.2554 0.15280 0.35790 0.5231 0.5856 5.143 0.4790
1816 239 S G -0.1330 -0.25570 -0.01024 -1.12400 0.90290 0.14500 0.2176 0.09886 0.33640 0.3443 0.3839 2.000 0.3395
6105 559 Q L -0.2814 -1.42100 -0.18540 -0.37740 -0.06623 -0.77730 0.1812 0.17160 0.19090 -0.5984 0.7335 4.571 0.6936
5235 490 Q L -1.4730 -1.50600 -1.44000 -1.57700 -0.91850 0.41890 0.1462 0.33720 -0.04486 0.5374 0.2450 5.857 1.5062
3834 388 Q Y -0.1516 -1.03200 0.04747 0.37960 -0.35070 0.50160 0.1412 0.14140 0.14100 0.5493 0.0976 5.143 0.3800
3833 388 Q W -0.2135 -0.46200 -0.06387 -0.01505 -0.36320 0.38560 0.1166 0.06678 0.16650 -0.2053 0.2456 6.429 0.3426
1863 242 R W -1.2840 -1.73800 -0.90460 -0.37760 -1.66300 0.09253 0.1141 -0.01441 0.24250 -0.4407 0.4435 6.286 1.2851
5419 507 V M -1.3570 -0.19970 -0.99620 -1.89000 -1.71900 0.41880 0.1024 0.53630 -0.33150 -0.4414 0.5062 5.143 1.3034

Find correlations between EFNB2 and EFNB3 binding¶

In [10]:
def plot_affinity_solid(df):
    slider = alt.binding_range(min=1, max=20, step=1, name="times_seen")
    selector = alt.param(name="SelectorName", value=1, bind=slider)
    df = df.dropna()
    # calculate r value
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df['binding_median_E2'], df['binding_median_E3'])
    r_value = float(r_value)
    # make chart
    chart = alt.Chart(df,title=alt.Title('Correlation between Mutant Binding Scores',subtitle=f'r={r_value:.2f}')).mark_point(color='black',size=30, opacity=0.2,filled=True).encode(
        x=alt.X('binding_median_E2', title=('Ephrin-B2 Binding')),
        y=alt.Y('binding_median_E3', title=('Ephrin-B3 Binding')),
        tooltip=['site', 'wildtype','mutant','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
    ).properties(
        width=300, 
        height=300
    ).add_params(selector).transform_filter(
            (alt.datum.times_seen_affinity_E2 >= selector)
    )
    min = int(df['binding_median_E2'].min())
    max = int(df['binding_median_E3'].max())
    text = alt.Chart({'values':[{'x': min, 'y': max, 'text': f'r = {r_value:.2f}'}]}).mark_text(
        align='left', baseline='top', dx=-10, dy=-20).encode(
            x=alt.X('x:Q'),
            y=alt.Y('y:Q'),
            text='text:N'
        )
    chart_and_text = chart #+ text
    return chart_and_text#.display()

E2_E3_corr = plot_affinity_solid(df_affinity_filter_merge)
E2_E3_corr.display()
E2_E3_corr.save(E2_E3_correlation)

Plot correlations between summary statistics for each site¶

In [11]:
def plot_affinity_solid_mean(df):
    df = df.dropna()
    means = df.groupby('site').agg({
            'effect_E2': 'median',
            'effect_E3': 'median',
            'binding_median_E2': 'median',
            'binding_median_E3': 'median',
            'wildtype': 'first'
        }).reset_index()
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(means['binding_median_E2'], means['binding_median_E3'])
    r_value = float(r_value)
    chart = alt.Chart(means,title=alt.Title('Correlation between Aggregate Mutant Binding Scores',subtitle=f'r={r_value:.2f}')).mark_point(size=50, color='black', opacity=0.3,filled=True).encode(
            x=alt.X('binding_median_E2', title=('Median Ephrin-B2 Binding'), axis=alt.Axis(tickCount=3)),
            y=alt.Y('binding_median_E3', title=('Median Ephrin-B3 Binding'), axis=alt.Axis(tickCount=3)),
            tooltip=['site', 'wildtype','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
        ).properties(
            width=300, 
            height=300
    )
    text = alt.Chart({'values':[{'x': -3.5, 'y': 0.5, 'text': f'r = {r_value:.2f}'}]}).mark_text(
        align='left', baseline='top', dx=0, dy=-10).encode(
            x=alt.X('x:Q'),
            y=alt.Y('y:Q'),
            text='text:N'
        )
    chart_and_text = chart #+ text
    return chart_and_text#.display()

E2_E3_site_corr = plot_affinity_solid_mean(df_affinity_filter_merge)
E2_E3_site_corr.display()
E2_E3_site_corr.save(E2_E3_correlation_site)

Make plot showing binding by site (median)¶

In [12]:
def plot_affinity_by_site_median(df):
    variant_selector = alt.selection_point(
        on="mouseover",
        empty=False,
        fields=["site"],
        value=0
    )  
    empty_charts = []
    for selection in ['binding_median_E2','binding_median_E3']:
        if selection == 'binding_median_E2':
            name = 'Ephrin-B2 Binding'
        else:
            name = 'Ephrin-B3 Binding'
        mean = df.groupby('site')[selection].sum().reset_index()
        chart = alt.Chart(mean).mark_point(size=60, color='black', stroke='black',filled=True).encode(
            x=alt.X('site', title=('Site'), axis=alt.Axis(grid=True, tickCount=4),scale=alt.Scale(domain=[70,602])),
            y=alt.Y(selection, title=(name), axis=alt.Axis(grid=True, tickCount=4)),
            tooltip=['site'],
            color=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),
            opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.4)),
            strokeWidth=alt.condition(variant_selector,alt.value(2),alt.value(0))
        ).properties(
            width=700, 
            height=200
        ).add_params(variant_selector)
        empty_charts.append(chart)
    combined_chart = alt.vconcat(*empty_charts, spacing=1,title='Summed Binding by Site')
    return combined_chart



binding_by_site = plot_affinity_by_site_median(df_affinity_filter_merge)
binding_by_site.display()
binding_by_site.save(binding_by_site_plot)

Make bubble plots for binding in different areas of receptor pocket¶

In [13]:
def make_boxplot_binding_region(df,title):# Create a box plot using Altair for aggregated means
    barrel_ranges = {
    'Hydrophobic': config['hydrophobic'],
    'Salt Bridges': config['salt_bridges'],
    'Hydrogen Bonds': config['h_bond_total'],
    'Contact': config['contact_sites'],
    'Overall': list(range(71,602)),
    }
    
    mean_df = df.groupby('site')[['binding_median']].median().reset_index()
    custom_order = ['Hydrophobic','Salt Bridges','Hydrogen Bonds','Contact','Overall']
    agg_means = []
    
    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = mean_df[mean_df['site'].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append({'barrel': barrel, 'effect': row['binding_median'],'site':row['site']})
        agg_means_df = pd.DataFrame(agg_means)
    chart = alt.Chart(agg_means_df).mark_point(filled=True,size=70,opacity=0.4,color='black').encode(
                x=alt.X('barrel:O', sort=custom_order,title=None,axis=alt.Axis(labelAngle=-90)),
                y=alt.Y('effect',title=f'Median {title} Binding',axis=alt.Axis(grid=True,tickCount=4)),
                xOffset='random:Q',
                color = alt.Color('barrel').legend(None),
                tooltip=['barrel', 'effect','site'],
            ).transform_calculate(
                random="sqrt(-1*log(random()))*cos(2*PI*random())"
        
            ).properties(
                height=alt.Step(20),
                width=alt.Step(25)
            )
    
    return chart.display()

make_boxplot_binding_region(df_E2_filter,'EFNB2')
make_boxplot_binding_region(df_E3_filter,'EFNB3')

Make lineplot by site showing mean binding affinity¶

Plot histogram¶

In [14]:
def effect_histogram(df):
    colors = {'E2': '#1f4e79', 'E3': '#ff7f0e'}
    all_charts = []
    for effect in ['E2','E3']:
        func_effect_container = []
        for func_effect in [-2]:# Melt the dataframe for the specific columns
            df = df[
                (df[f'effect_{effect}'] > func_effect)
            ]
            color = colors[effect]    
            df_melted = df.melt(value_vars=['binding_median_E2', 'binding_median_E3'], var_name='Effect', value_name='Value')
        
            # Histogram for 'effect_E2'
            histogram = alt.Chart(df_melted[df_melted['Effect'] == f'binding_median_{effect}']).mark_bar(opacity=1,color=color).encode(
                x=alt.X('Value', bin=alt.Bin(step=0.1), title=f'Binding {effect}',axis=alt.Axis(tickCount=4,values=[3,1,0,-1,-5]),scale=alt.Scale(domain=[-6,3])),
                y=alt.Y('count()',stack=None,title='Count'),
                #color=alt.Color('red'),
                tooltip=['Effect', 'count()']
            ).properties(
                width=150, 
                height=alt.Step(10)
            )
            func_effect_container.append(histogram)
        combined_effect_chart = alt.hconcat(*func_effect_container).resolve_scale(y='shared', x='shared', color='independent')
        all_charts.append(combined_effect_chart)
    final_combined_chart = alt.vconcat(*all_charts).resolve_scale(y='independent', x='independent', color='independent')
    return final_combined_chart.display()
    
effect_histogram(df_affinity_filter_merge)
#effect_histogram(df_affinity_filter_merge,'#ff7f0e','binding_mean_E3')

Make binding score heatmaps¶

First, prep data for heatmap¶

In [15]:
entropy_df = pd.read_csv(entropy_file)
entropy_df = entropy_df[['site','henipavirus_entropy']]
entropy_df = entropy_df.dropna(subset=['site'])
entropy_df['site'] = entropy_df['site'].astype('Int64')
entropy_df = entropy_df.rename(columns={'henipavirus_entropy':'entropy'})
entropy_df = entropy_df[['site','entropy']].drop_duplicates()
entropy_df['mutant'] = 'entropy'
entropy_df['wildtype'] = ''  
entropy_df['effect'] = 'effect'
entropy_df.rename(columns={'entropy': 'value'}, inplace=True)
display(entropy_df.head(3))

def make_contact():
    df = pd.DataFrame({
    'site': config['contact_sites'],
    'contact': [0.0] * len(config['contact_sites'])
    })
    df = df[['site','contact']]
    df['mutant'] = 'contact'
    df['wildtype'] = ''
    df['effect'] = 'binding_median'
    df.rename(columns={'contact':'value'}, inplace=True)
    return df
        
contact_df = make_contact()
display(contact_df.head(3))

def make_empty_df_with_filtered(df,filter_df):
    sites = range(71, 603)
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    # Create the combination of each site with each amino acid
    data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
    # Create the DataFrame
    empty_df = pd.DataFrame(data)
    all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
    df_melted = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
                                 value_vars=['binding_median'], 
                                 var_name='effect', value_name='value')

    df_filtered = filter_df.melt(id_vars=['site', 'mutant', 'wildtype'],
                                 value_vars=['effect'], 
                                 var_name='effect', value_name='value')
    
    df_test = pd.concat([df_melted,df_filtered,contact_df],ignore_index=True)
    
    return df_test

empty_df_E2 = make_empty_df_with_filtered(df_E2_filter,df_E2_filter_missing)
empty_df_E3 = make_empty_df_with_filtered(df_E3_filter,df_E3_filter_missing)
site value mutant wildtype effect
0 1 -0.00 entropy effect
1 2 1.75 entropy effect
2 3 1.75 entropy effect
site value mutant wildtype effect
0 239 0.0 contact binding_median
1 240 0.0 contact binding_median
2 241 0.0 contact binding_median
In [16]:
def plot_heatmap_full(df,legend_title):
    # Create y-axis order list
    custom_order = ['contact','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C']    #custom_order = names + custom_order

    final_df = df

    full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
    
    # Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
    final_df = final_df.sort_values('site')
    sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
    final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
    # Now sort the dataframe by this rank
    final_df = final_df.sort_values('mutant_rank')
    # Drop the 'mutant_rank' column as it is no longer needed after sorting
    final_df = final_df.drop(columns=['mutant_rank'])
    sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
    if legend_title == 'Ephrin-B2 Binding':
        color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-6,2.5]) #EFNB3 is -1.5,2, EFNB2 is -6,3
    else:
        color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-2,2]) #EFNB3 is -1.5,2, EFNB2 is -6,3
    color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
    color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
    color_scale_features = alt.Scale(scheme='greys', domainMid=0, domain=[0, 5],reverse=True)

    strokewidth_size = 0.25
    
    effect_legend_added = False
    charts = [] # container to hold the charts
    for idx, subset in enumerate(full_ranges): 
        subset_df = final_df[final_df['site'].isin(subset)] #for the wrapping of sites
        unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
        is_last_plot = idx == len(full_ranges) - 1
        x_axis = alt.Axis(labelAngle=-90,
                        labelExpr="datum.value % 10 === 0 ? datum.value : ''",
                        title="Site" if is_last_plot else None,
                        labels=True)  # Only show labels for the last plot

    
    #for subset in full_ranges: 
        #subset_df = final_df[final_df['site'].isin(subset)]
        #unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype'])
        # The chart for the heatmap
        base = (
            alt.Chart(subset_df)
            .encode(
                x=alt.X('site:O', title='Site', sort=sites, axis=x_axis),
                y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
                tooltip=['site','wildtype','mutant','value'],
            )
            
            .properties(
                width=alt.Step(10),
                height=alt.Step(11) 
            )
        )
        chart_empty = (
            base.mark_rect(color='#d1d3d4').encode().transform_filter(
                (alt.datum.effect == 'binding_median') 
            )
        )
        # Heatmap for effect
        if not effect_legend_added:
            chart_effect = (
                base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
                color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
                alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)), 
                alt.value('transparent')),
                ).transform_filter(
                    alt.datum.effect == 'binding_median'
                )
            )
            effect_legend_added = True
        else:
            chart_effect = (
                base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
                color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
                alt.Color('value:Q', scale=color_scale_effect,legend=None), 
                alt.value('transparent')),
                ).transform_filter(
                    alt.datum.effect == 'binding_median'
                )
            )
        chart_filtered = (
            base.mark_rect(color='#939598',stroke='black',strokeWidth = strokewidth_size).encode(
            ).transform_filter(
                alt.datum.effect == 'effect'
            )
        )
        chart_contact = (
            base.mark_rect(color='black').encode(
            #color=alt.condition(f'datum.mutant != "entropy"',
            #alt.Color('value:Q', scale=color_scale_contact,legend=None), 
            #alt.value('transparent')),
            ).transform_filter(
                alt.datum.mutant == 'contact'
            )
        )
        # The layer for the wildtype boxes
        wildtype_layer_box = (
            alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
                x=alt.X('site:O', sort=sites),
                y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
                opacity=alt.value(1)  
            )
            .transform_filter(
                (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            )
        )
        # The layer for the wildtype amino acids
        wildtype_layer = (
            alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8, align='center', baseline='middle').encode(
                x=alt.X('site:O', sort=sites),
                y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
                opacity=alt.value(1)  
            )
            .transform_filter(
                (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            )
        )    
        # Combine the heatmap layer with the wildtype layer
        chart = alt.layer(chart_empty, chart_effect,chart_filtered,chart_contact, wildtype_layer_box,wildtype_layer).resolve_scale(color='independent',y='shared')
        
        charts.append(chart)
    
    combined_chart = alt.vconcat(*charts,spacing=3,title=f'{legend_title}').resolve_scale(y='independent', x='independent',color='shared')
    return combined_chart#.display()


E2_binding_plot = plot_heatmap_full(empty_df_E2,'Ephrin-B2 Binding')
E2_binding_plot.display()
E2_binding_plot.save(E2_binding_heatmap)
E3_binding_plot = plot_heatmap_full(empty_df_E3,'Ephrin-B3 Binding')
E3_binding_plot.display()
E3_binding_plot.save(E3_binding_heatmap)

Now prep data for contact site heatmaps¶

In [17]:
entropy_df = pd.read_csv(entropy_file)
entropy_df = entropy_df[['site','henipavirus_entropy']]
entropy_df = entropy_df.dropna(subset=['site'])
entropy_df['site'] = entropy_df['site'].astype('Int64')
entropy_df = entropy_df.rename(columns={'henipavirus_entropy':'entropy'})
entropy_df = entropy_df[['site','entropy']].drop_duplicates()
entropy_df['mutant'] = 'entropy'
entropy_df['wildtype'] = ''  
entropy_df['effect'] = 'effect'
entropy_df.rename(columns={'entropy': 'value'}, inplace=True)
#display(entropy_df.head(3))

def make_contact(list_,name_):
    df = pd.DataFrame({
    'site': list_,
    name_: [0.0] * len(list_)
    })
    df = df[['site',name_]]
    df['mutant'] = name_
    df['wildtype'] = ''
    df['effect'] = 'binding_median'
    df.rename(columns={name_:'value'}, inplace=True)
    return df
        
contact_df = make_contact(config['contact_sites'],'contact')
#hydrophobic_df = make_contact(hydrophobic,'hydrophobic')
#salt_bridge_df = make_contact(salt_bridges,'salt bridge')
#h_bond_df = make_contact(h_bond_total,'hydrogen bond')


#display(contact_df.head(3))

def make_empty_df_with_filtered(df,filtered_df):
    sites = range(71, 603)
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    # Create the combination of each site with each amino acid
    data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
    # Create the DataFrame
    empty_df = pd.DataFrame(data)
    all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
    df_melted = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
                                 value_vars=['binding_median'], 
                                 var_name='effect', value_name='value')

    df_filtered = filtered_df.melt(id_vars=['site', 'mutant', 'wildtype'],
                                 value_vars=['effect'], 
                                 var_name='effect', value_name='value')
    
    df_test = pd.concat([df_melted,df_filtered],ignore_index=True) #contact_df
    
    return df_test

empty_df_E2 = make_empty_df_with_filtered(df_E2_filter,df_E2_filter_missing)
empty_df_E3 = make_empty_df_with_filtered(df_E3_filter,df_E3_filter_missing)
In [ ]:
 
In [18]:
def plot_heatmap_contact(df,subset,name):
    # Create y-axis order list
    custom_order = ['R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C']    #custom_order = names + custom_order

    final_df = df

    full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
    
    # Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
    final_df = final_df.sort_values('site')
    sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
    final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
    # Now sort the dataframe by this rank
    final_df = final_df.sort_values('mutant_rank')
    # Drop the 'mutant_rank' column as it is no longer needed after sorting
    final_df = final_df.drop(columns=['mutant_rank'])
    sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
    if name == 'Ephrin-B2 Binding':
        color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-6,2.5]) 
    else:
        color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-2,1.5]) 
    color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
    color_scale_contact = alt.Scale(scheme='teals', domainMid=0, domain=[0, 5],reverse=True)
    color_scale_features = alt.Scale(scheme='greys', domainMid=0, domain=[0, 5],reverse=True)
    
    
    subset_df = final_df[final_df['site'].isin(subset)]
    unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype'])
    strokewidth_size = 0.25
    
    charts = [] # container to hold the charts
    # The chart for the heatmap
    base = (
        alt.Chart(subset_df)
        .encode(
            x=alt.X('site:O', title='Contact Site', sort=sites, axis=alt.Axis(labelAngle=-90,grid=False)),
            y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
            tooltip=['site','wildtype','mutant','value'],
        )
        
        .properties(
            width=alt.Step(10),
            height=alt.Step(11) 
        )
    )
    chart_empty = (
        base.mark_rect(color='#e6e7e8').encode().transform_filter(
            (alt.datum.effect == 'binding_median') 
        )
    )
    # Heatmap for effect
    chart_effect = (
        base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
        color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
        alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title='binding')), 
        alt.value('transparent')),
        ).transform_filter(
            alt.datum.effect == 'binding_median'
        )
    )
    chart_filtered = (
        base.mark_rect(color='#939598',stroke='black',strokeWidth = strokewidth_size).encode(
        ).transform_filter(
            alt.datum.effect == 'effect'
        )
    )
    chart_contact = (
        base.mark_rect().encode(
        color=alt.condition(f'datum.mutant != "entropy"',
        alt.Color('value:Q', scale=color_scale_contact,legend=None), 
        alt.value('transparent')),
        ).transform_filter(
            (alt.datum.mutant == 'hydrophobic') | (alt.datum.mutant == 'salt bridge') | (alt.datum.mutant == 'hydrogen bond')
        )
    )
    # The layer for the wildtype boxes
    wildtype_layer_box = (
        alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )
    # The layer for the wildtype amino acids
    wildtype_layer = (
        alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8, align='center', baseline='middle').encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )    
    # Combine the heatmap layer with the wildtype layer
    chart = alt.layer(chart_empty, chart_effect,chart_filtered,wildtype_layer_box,wildtype_layer).resolve_scale(color='independent') #chart_contact, 
    
    charts.append(chart)

    combined_chart = alt.vconcat(*charts,title=f'{name}').resolve_scale(y='independent', x='independent',color='shared')
    return combined_chart.display()

plot_heatmap_contact(empty_df_E2,config['contact_sites'],'Ephrin-B2 Binding')
plot_heatmap_contact(empty_df_E3,config['contact_sites'],'Ephrin-B3 Binding')

Plot heatmap of sites with mutant effects different between amino acid types¶

In [19]:
def check_muts_by_properties(df,min_group_size,positive_value,negative_value):
    letter_to_type = {'D': 'negative', 
                      'E': 'negative', 
                      'K': 'positive', 
                      'R': 'positive', 
                      'H': 'positive', 
                      'C': 'special',
                      'S': 'hydrophilic', 
                      'T': 'hydrophilic',
                      'N': 'hydrophilic', 
                      'Q': 'hydrophilic',
                      'G': 'special',
                      'A': 'hydrophobic', 
                      'V': 'hydrophobic',
                      'L': 'hydrophobic',
                      'I': 'hydrophobic', 
                      'M': 'hydrophobic',
                      'P': 'special',
                      'F': 'aromatic', 
                      'Y': 'armomatic', 
                      'W': 'aromatic', 
                      '*': 'stop'}

    df['mutant_type'] = df['mutant'].map(letter_to_type)
    #display(df)
    #df = df[df['mutant_type'] != 'special']
    #min_group_size = 3  # Set your minimum group size
    grouped = df.groupby(['site', 'mutant_type'])
    filtered_groups = grouped.filter(lambda x: (x['binding_median'] > positive_value).all() and len(x) >= min_group_size)

    positive_mask = (
        filtered_groups.groupby('site')['binding_median'].transform(lambda x: (x > positive_value).all()) &
        df.groupby('site')['binding_median'].transform(lambda x: (x <= positive_value).any())
    )
    positive_sites = df[positive_mask]
    
    filtered_groups_neg = grouped.filter(lambda x: (x['binding_median'] < negative_value).all() and len(x) >= min_group_size)
    
    negative_mask = (
        filtered_groups_neg.groupby('site')['binding_median'].transform(lambda x: (x < negative_value).all()) &
        df.groupby('site')['binding_median'].transform(lambda x: (x <= negative_value).any())
    )
    negative_sites = df[negative_mask]
    
    common_sites = pd.merge(positive_sites, negative_sites, on='site', how='inner')
    final_sites = common_sites['site'].drop_duplicates()
    return final_sites
final_sites_E2 = check_muts_by_properties(df_E2_filter,1,0,-0.5)
final_sites_E3 = check_muts_by_properties(df_E3_filter,1,0,-0.5)
print(list(final_sites_E2))
[153, 169, 202, 214, 215, 232, 233, 235, 241, 244, 245, 246, 247, 250, 268, 271, 284, 305, 309, 310, 319, 337, 347, 362, 378, 388, 394, 404, 406, 409, 440, 458, 463, 479, 482, 508, 511, 525, 529, 555, 577, 578, 591, 593]
In [20]:
def plot_heatmap_pref_switch(df,subset,name):
    # Create y-axis order list
    custom_order = ['contact','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C']    #custom_order = names + custom_order

    final_df = df

    full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
    
    # Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
    final_df = final_df.sort_values('site')
    sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
    final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
    # Now sort the dataframe by this rank
    final_df = final_df.sort_values('mutant_rank')
    # Drop the 'mutant_rank' column as it is no longer needed after sorting
    final_df = final_df.drop(columns=['mutant_rank'])
    sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
    if name == 'EFNB2 Binding':
        color_scale_effect = alt.Scale(scheme='redblue', domainMid=0) #EFNB3 is -1.5,2, EFNB2 is -6,3
    else:
        color_scale_effect = alt.Scale(scheme='redblue', domainMid=0) #EFNB3 is -1.5,2, EFNB2 is -6,3
    color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
    color_scale_contact = alt.Scale(scheme='teals', domainMid=0, domain=[0, 5],reverse=True)
    color_scale_features = alt.Scale(scheme='greys', domainMid=0, domain=[0, 5],reverse=True)
    subset_df = final_df[final_df['site'].isin(subset)]
    unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype'])
    strokewidth_size = 0.25
    
    charts = [] # container to hold the charts
    # The chart for the heatmap
    base = (
        alt.Chart(subset_df)
        .encode(
            x=alt.X('site:O', title=None, sort=sites, axis=alt.Axis(labelAngle=-90,grid=False)),
            y=alt.Y('mutant', title=None, sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
            tooltip=['site','wildtype','mutant','value'],
        )
        
        .properties(
            width=alt.Step(10),
            height=alt.Step(11) 
        )
    )
    chart_empty = (
        base.mark_rect(color='#e6e7e8').encode().transform_filter(
            (alt.datum.effect == 'binding_median') 
        )
    )
    # Heatmap for effect
    chart_effect = (
        base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
        color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
        alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=name)), 
        alt.value('transparent')),
        ).transform_filter(
            alt.datum.effect == 'binding_median'
        )
    )
    chart_filtered = (
        base.mark_rect(color='#939598',stroke='black',strokeWidth = strokewidth_size).encode(
        ).transform_filter(
            alt.datum.effect == 'effect'
        )
    )
    chart_contact = (
        base.mark_rect().encode(
        color=alt.condition(f'datum.mutant != "entropy"',
        alt.Color('value:Q', scale=color_scale_contact,legend=None), 
        alt.value('transparent')),
        ).transform_filter(
            (alt.datum.mutant == 'hydrophobic') | (alt.datum.mutant == 'salt bridge') | (alt.datum.mutant == 'hydrogen bond')
        )
    )
    # The layer for the wildtype boxes
    wildtype_layer_box = (
        alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )
    # The layer for the wildtype amino acids
    wildtype_layer = (
        alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8, align='center', baseline='middle').encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )    
    # Combine the heatmap layer with the wildtype layer
    chart = alt.layer(chart_empty, chart_effect,chart_filtered,chart_contact, wildtype_layer_box,wildtype_layer).resolve_scale(color='independent')
    
    charts.append(chart)

    combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='shared')
    return combined_chart.display()
cysteine_neck = [146, 158, 162]
cysteine_1 = [189,601]
cysteine_2 = [216, 240]
cysteine_3 = [282,295]
cysteine_4 = [382, 395]
cysteine_5 = [387, 499]
cysteine_6 = [493, 503]
cysteine_7 = [565, 574]

cysteine = cysteine_neck + cysteine_1 + cysteine_2 + cysteine_3 + cysteine_4 + cysteine_5 + cysteine_6 + cysteine_7
#print(cysteine)

n_linked = [159,306,378,417,481,529]
#n_linked_removal = [161,308,380,419,483,531]
stalk = list(range(96, 147))
neck = list(range(148,166))
linker = list(range(166,177))
#PNLG = [164, 180, 187, 191, 195, 275, 288, 311, 326, 359, 403, 423, 430, 473, 478, 543, 554, 570, 600]

total_list = [n_linked,stalk, neck, linker,cysteine]

for i in total_list:
    plot_heatmap_pref_switch(empty_df_E2,i,'EFNB2 Binding')
    plot_heatmap_pref_switch(empty_df_E3,i,'EFNB3 Binding')

Now plot heatmaps depending on wildtype amino acid property¶

In [21]:
hydrophobic_AA = ['A','V','L','I','M']
aromatic_AA = ['Y','W','F']
positive_AA = ['K','R','H']
negative_AA = ['E','D']
hydrophilic_AA = ['S','T','N','Q']

def find_aa_wildtype_sites(df,aa_type):
    aa_list = list(df[df['wildtype'].isin(aa_type)]['site'].unique())
    return aa_list


hydrophobic_AA_list = find_aa_wildtype_sites(df_E2_filter,hydrophobic_AA)
aromatic_AA_list = find_aa_wildtype_sites(df_E2_filter,aromatic_AA)
positive_AA_list = find_aa_wildtype_sites(df_E2_filter,positive_AA)
negative_AA_list = find_aa_wildtype_sites(df_E2_filter,negative_AA)
hydrophilic_AA_list = find_aa_wildtype_sites(df_E2_filter,hydrophilic_AA)

all_AA = [hydrophobic_AA_list,aromatic_AA_list,positive_AA_list,negative_AA_list,hydrophilic_AA_list]

for i in all_AA:
    plot_heatmap_pref_switch(empty_df_E2,i,'EFNB2 Binding')
    plot_heatmap_pref_switch(empty_df_E3,i,'EFNB3 Binding')
In [ ]: